In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
In [2]:
%load ../ud120-projects/final_project/poi_id.py
In [1]:
#%%writefile ../ud120-projects/final_project/poi_id.py
#!/usr/bin/python
import matplotlib.pyplot as plt
import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat
from feature_format import targetFeatureSplit
### features_list is a list of strings, each of which is a feature name
### first feature must be "poi", as this will be singled out as the label
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances',
                 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
                 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
                 'restricted_stock', 'director_fees', 'to_messages',
                 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
                 'shared_receipt_with_poi']  # 'poi' appears only once, as the label
### load the dictionary containing the dataset
data_dict = pickle.load(open("../ud120-projects/final_project/final_project_dataset.pkl", "r") )
### we suggest removing any outliers before proceeding further
### if you are creating any new features, you might want to do that here
### store to my_dataset for easy export below
my_dataset = data_dict
### these two lines extract the features specified in features_list
### and extract them from data_dict, returning a numpy array
data = featureFormat(my_dataset, features_list)
### if you are creating new features, could also do that here
### split into labels and features (this line assumes that the first
### feature in the array is the label, which is why "poi" must always
### be first in features_list
labels, features = targetFeatureSplit(data)
### machine learning goes here!
### please name your classifier clf for easy export below
clf = None ### get rid of this line! just here to keep code from crashing out-of-box
### dump your classifier, dataset and features_list so
### anyone can run/check your results
pickle.dump(clf, open("../ud120-projects/final_project/my_classifier.pkl", "w") )
pickle.dump(data_dict, open("../ud120-projects/final_project/my_dataset.pkl", "w") )
pickle.dump(features_list, open("../ud120-projects/final_project/my_feature_list.pkl", "w") )
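The skeleton above runs end-to-end once clf = None is replaced by any scikit-learn classifier. The sketch below uses a GaussianNB purely as a hypothetical placeholder; the actual model selection happens in the rest of this notebook.
In [ ]:
# hypothetical placeholder so the skeleton can be run and dumped out of the box;
# the real classifier is chosen further down in this notebook
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features, labels)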
In [ ]:
#%load ../ud120-projects/final_project/tester.py
In [ ]:
#!/usr/bin/python
""" a basic script for importing student's POI identifier,
and checking the results that they get from it
requires that the algorithm, dataset, and features list
be written to my_classifier.pkl, my_dataset.pkl, and
my_feature_list.pkl, respectively
that process should happen at the end of poi_id.py
"""
import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
### load up student's classifier, dataset, and feature_list
clf = pickle.load(open("my_classifier.pkl", "r") )
dataset = pickle.load(open("my_dataset.pkl", "r") )
feature_list = pickle.load(open("my_feature_list.pkl", "r"))
### print basic info about the algorithm/parameters used
print clf
### prepare data for training/testing
data = featureFormat(dataset, feature_list)
labels, features = targetFeatureSplit(data)
### stratified k-fold cross-validation is a form of
### CV where instances of each class are proportionally apportioned--
### e.g. if you have 10% of one class and 90% of the other,
### stratification means each fold will have 10% of one
### class and 90% of the other
###
### this is helpful when you don't have a lot of instances
### of one class or the other, because in that case the
### low-frequency class can become lopsided in the train/test
### split and skew the results
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf:
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )
    ### fit the classifier using training set, and test on test set
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    ### for each fold, print some metrics
    print
    print "precision score: ", precision_score( labels_test, pred )
    print "recall score: ", recall_score( labels_test, pred )
    precisions.append( precision_score(labels_test, pred) )
    recalls.append( recall_score(labels_test, pred) )
### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.
#print precision_score( labels_test, pred )
#print recall_score( labels_test, pred )
In [4]:
data_dict = pickle.load(open("../ud120-projects/final_project/my_dataset.pkl", "r") )
In [56]:
#[v for k,v in data_dict.items()][0]
In [5]:
data_dict.items()[0]
Out[5]:
In [6]:
df = pd.DataFrame.from_dict(my_dataset, orient='index')
In [7]:
#%load ../ud120-projects/tools/feature_format.py
In [8]:
df.head()
Out[8]:
In [9]:
df['salary'].unique()
Out[9]:
'NaN' was imported as a string rather than as a missing value. We will convert these strings to actual NaN values and look at how many missing values the data has.
In [10]:
df = df.replace('NaN', np.nan)
In [11]:
df.info()
There is a lot of missing data!
In [12]:
print "NaN - Missing values:"
len(df.index)-df.count()
Out[12]:
First, check for entries that may not be actual people by looking for index names that don't contain a space.
In [13]:
[suspect for suspect in df.index if " " not in suspect]
Out[13]:
TOTAL is an aggregate category, and not a person's name. This should be removed.
In [14]:
df = df.drop('TOTAL', axis=0)
Next, we'll look at people who have three or fewer non-null entries (one of which is simply the True/False poi label, not a real feature) out of 21 columns. One happens to be a travel agency, and the others are missing nearly all entries as well.
These are good candidates for potential removal.
In [15]:
print [ind for ind in enumerate(df.T.count()) if ind[1] <= 3]
df.irow([56, 84, 127, 137, 142])
Out[15]:
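Equivalently, without relying on hard-coded row positions, the same sparse rows can be pulled out directly from the count condition, as a small sketch:
In [ ]:
# rows with three or fewer non-null entries (the boolean poi label counts as one)
df[df.count(axis=1) <= 3]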
In [16]:
#df.columns
#df = df.drop(['Name'], axis=1)
df = df.drop(['GRAMM WENDY L', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E', 'WHALEY DAVID A', 'WROBEL BRUCE'], axis=0)
Email address is also not needed for this model as it is a unique string for each person.
In [17]:
df = df.drop(['email_address'], axis=1)
First, we must deal with the NaNs, since many models can't handle missing values. A quick and dirty solution is to simply fill the missing values with 0s. This is just to get a model up and running; the missing values will be handled differently later.
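For reference, that quick zero-fill is a one-liner on the DataFrame (shown only as a sketch; df_zero_filled is a hypothetical name, and the notebook below ends up using per-column medians instead):
In [ ]:
# quick-and-dirty fill: replace every missing value with 0
df_zero_filled = df.fillna(0)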
In [18]:
from sklearn.cross_validation import train_test_split
In [98]:
labels = df['poi']
features = df.drop('poi', axis=1)
features_train, features_test, labels_train, labels_test = train_test_split(features, labels,
test_size=0.2,
random_state=808)
In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV
In [21]:
#param_grid = [{'C':[.0001, .001, .01, 0.1, 1, 10, 100, 1000], 'gamma': [10, 1, .1, .01, .001, .0001]}]
In [116]:
from sklearn.cross_validation import StratifiedShuffleSplit
In [122]:
sss = StratifiedShuffleSplit(df.poi, n_iter=10, test_size=0.2)  # an iterable of 10 stratified (train_idx, test_idx) splits, not a single pair
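StratifiedShuffleSplit yields one (train index, test index) pair per iteration rather than a single split, so it would be consumed like this (a sketch under the old sklearn.cross_validation API used throughout this notebook, with features and labels as defined a few cells above):
In [ ]:
# each iteration is one stratified 80/20 split of the row positions
for train_idx, test_idx in sss:
    features_train, features_test = features.iloc[train_idx], features.iloc[test_idx]
    labels_train, labels_test = labels.iloc[train_idx], labels.iloc[test_idx]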
In [23]:
#svm_model = SVC()
#clf = grid_search.GridSearchCV(svm_model, param_grid, n_jobs=4, scoring='f1')
In [25]:
#NaN, can't fit yet
#clf.fit(features_train, labels_train)
In [27]:
#rf = RandomForestClassifier(n_estimators=1000, n_jobs=4)
In [29]:
#NaN, can't fit yet
#rf.fit(features_train, labels_train)
In [34]:
#pred = rf.predict(features_test)
#print "Accuracy:", accuracy_score(labels_test, pred), '\n'
#print "Confusion Matrix:\n", confusion_matrix(labels_test, pred), '\n'
#print "Classification Report:", classification_report(labels_test, pred)
In [32]:
#features = np.array(features)
#labels = np.array(labels)
#features_test = np.array(features_test)
#features_train = np.array(features_train)
#labels_test = np.array(labels_test)
#labels_train = np.array(labels_train)
In [94]:
df.head()
Out[94]:
In [39]:
#df[(df.poi == True)].email_address
In [34]:
df.info()
In [46]:
df.describe()
Out[46]:
In [47]:
import matplotlib.pyplot as plt
In [35]:
plt.plot(df.salary.fillna(df.salary.median()))  # fill missing salaries with the salary median
plt.plot(df[df.poi==True].salary)
Out[35]:
In [78]:
df.apply(lambda x: x.fillna(x.median()), axis=0).describe()
Out[78]:
In [74]:
df.info()
In [101]:
plt.plot(df.long_term_incentive, 'ro')
plt.plot(df[df.poi==True].long_term_incentive, 'bo')
Out[101]:
In [36]:
df1 = df.drop(['deferral_payments', 'restricted_stock_deferred', 'loan_advances', 'director_fees'], axis=1)
In [37]:
f1 = df1.drop(['poi'], axis=1)
y1 = df['poi']
In [38]:
from sklearn.preprocessing import scale
We need to fill in the missing values.
In [40]:
f1 = f1.apply(lambda x: x.fillna(x.median()), axis=0)
f_scaled = scale(f1)
In [41]:
from sklearn.decomposition import PCA
In [42]:
# Keep the number of components that capture 90% of the variance.
pca = PCA(n_components=0.90, whiten=True).fit(f_scaled)
In [43]:
# There are 9 principal components that capture at least 90% of the variance.
# This is a reduction from the 15 scaled input features.
pca.n_components_
Out[43]:
In [228]:
pca.explained_variance_ratio_
Out[228]:
In [44]:
# Exact total variance captured by 9 principal components is 92.2%
sum(pca.explained_variance_ratio_)
Out[44]:
In [45]:
# Use the pca model to transform our training and testing set features.
x_pca = pca.transform(f_scaled)
In [56]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
param_grid = {
'C': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, .1, 1, 10, 100],
'gamma': [1e-300, 1e-200, 1e-100, 1e-30, 1e-20, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid,n_jobs=4)
clf = clf.fit(x_pca, y1)
In [57]:
print clf.best_estimator_
In [59]:
# SVC/SVM
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
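# Note: the predictions below are made on the same data the grid search was fit on
# (no held-out set), so these scores are optimistic; the same caveat applies to the
# random forest cell that follows.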
y_pred = clf.predict(x_pca)
print classification_report(y1, y_pred)
print confusion_matrix(y1, y_pred)
print accuracy_score(y1, y_pred)
In [90]:
#Random Forest
rf = RandomForestClassifier(n_estimators=2000, n_jobs=4, oob_score=True)
rf.fit(x_pca, y1)
y_pred = rf.predict(x_pca)
print classification_report(y1, y_pred)
print confusion_matrix(y1, y_pred)
print accuracy_score(y1, y_pred)
In [72]:
from operator import itemgetter
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
In [83]:
# !!
# Use oob_score=True and bootstrap=True for OOB estimates for random forests.
# !!
clf = RandomForestClassifier(n_estimators=2000, oob_score=True)
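Once such a forest is fitted, the out-of-bag estimate is exposed as an attribute. A minimal sketch using the PCA-transformed features from above (rf_oob is a hypothetical name, separate from the grid-searched clf below):
In [ ]:
# OOB samples give a built-in generalization estimate without a separate test set
rf_oob = RandomForestClassifier(n_estimators=200, oob_score=True, bootstrap=True)
rf_oob.fit(x_pca, y1)
print "OOB accuracy estimate: ", rf_oob.oob_score_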
In [84]:
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
"max_features": [1, 3, 9],
"min_samples_split": [1, 3, 10],
"min_samples_leaf": [1, 3, 10],
"criterion": ["gini", "entropy"]}
In [85]:
# run the grid search
# n_jobs=4 for parallel processing across 4 cores
grid_search = GridSearchCV(clf, param_grid=param_grid, pre_dispatch=8, n_jobs=4)
In [86]:
grid_search.fit(x_pca, y1)
Out[86]:
In [87]:
report(grid_search.grid_scores_)
In [88]:
clf = grid_search.best_estimator_
clf
Out[88]:
In [91]:
features = x_pca.copy()
labels = y1.copy()
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf:
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )
    ### fit the classifier using training set, and test on test set
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    ### for each fold, print some metrics
    print
    print "precision score: ", precision_score( labels_test, pred )
    print "recall score: ", recall_score( labels_test, pred )
    print "confusion matrix\n", confusion_matrix(labels_test, pred)
    precisions.append( precision_score(labels_test, pred) )
    recalls.append( recall_score(labels_test, pred) )
### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.